import libraries

for graphic - plotly for data - pandas

In [1]:
# data
import pandas as pd #import library
import numpy as np
# graph
import plotly.express as px
import plotly.graph_objects as go
# sys
import sys
import os

reading the data

In [2]:
# directory path
AppPath = sys.path[0]
data_folder = AppPath + '/data'
file = 'data.csv'

# printing the ways
print(data_folder)
print(AppPath)
# to raw data folder
os.chdir(data_folder)
# import the data
df = pd.read_csv(file, sep='~', parse_dates=True)
# converting dtypes
downcast = 'integer'
errors = 'coerce'
# to int
df['id']= pd.to_numeric(df['id'], downcast=downcast, errors=errors)
df['max_id'] = pd.to_numeric(df['max_id'], downcast=downcast, errors=errors)
df['min_id'] = pd.to_numeric(df['min_id'], downcast=downcast, errors=errors)
df['is_end'] = pd.to_numeric(df['is_end'], downcast=downcast, errors=errors)
df['is_warning'] = pd.to_numeric(df['is_warning'], downcast=downcast, errors=errors)
df['time_delta_min'] = df['time_delta_min'].astype('int64',errors='ignore')
# dt
df['date'] = pd.to_datetime(df['date'])
# float

df
D:\stat\telegram_trevoga/data
D:\stat\telegram_trevoga
Out[2]:
id last_end unique_warning index _ datetime message ttl_period action region ... month war_day war_week weekday_num weekday is_end is_warning min_id max_id time_delta_min
0 350 False True 1 Message 2022-04-21 16:47:12+00:00 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... NaN NaN Одеська ... April 57 8 3 Thursday 0 1 350.0 NaN NaN
1 349 True False 2 Message 2022-04-21 14:55:54+00:00 Отбой тревоги✅\n\nКанал тревог: https://t.me/+... NaN NaN Одеська ... April 57 8 3 Thursday 1 0 NaN 349.0 70.0
2 347 False True 3 Message 2022-04-21 13:45:43+00:00 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... NaN NaN Одеська ... April 57 8 3 Thursday 0 1 347.0 NaN NaN
3 346 True False 4 Message 2022-04-20 20:09:27+00:00 Отбой тревоги✅ NaN NaN Одеська ... April 56 8 2 Wednesday 1 0 NaN 346.0 20.0
4 345 False True 5 Message 2022-04-20 19:49:10+00:00 🚨ОДЕССА И ОБЛАСТЬ — ВОЗДУШНАЯ ТРЕВОГА!\n\nВсе ... NaN NaN Одеська ... April 56 8 2 Wednesday 0 1 345.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9040 8 False False 1011 Message 2022-02-27 17:40:33+00:00 ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... NaN NaN Донецька ... February 4 0 6 Sunday 0 1 4.0 NaN NaN
9041 7 False False 1012 Message 2022-02-27 17:40:30+00:00 ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... NaN NaN Донецька ... February 4 0 6 Sunday 0 1 4.0 NaN NaN
9042 6 False False 1013 Message 2022-02-27 17:40:27+00:00 ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... NaN NaN Донецька ... February 4 0 6 Sunday 0 1 4.0 NaN NaN
9043 5 False False 1014 Message 2022-02-27 17:40:24+00:00 ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... NaN NaN Донецька ... February 4 0 6 Sunday 0 1 4.0 NaN NaN
9044 4 False True 1015 Message 2022-02-27 17:40:18+00:00 ‼️‼️‼️Увага! Оголошено тривогу, усім пройти в ... NaN NaN Донецька ... February 4 0 6 Sunday 0 1 4.0 NaN NaN

9045 rows × 25 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9045 entries, 0 to 9044
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              9045 non-null   int16         
 1   last_end        9045 non-null   bool          
 2   unique_warning  9045 non-null   bool          
 3   index           9045 non-null   int64         
 4   _               9045 non-null   object        
 5   datetime        9045 non-null   object        
 6   message         9045 non-null   object        
 7   ttl_period      0 non-null      float64       
 8   action          0 non-null      float64       
 9   region          9045 non-null   object        
 10  command_center  9045 non-null   object        
 11  time            9045 non-null   object        
 12  date            9045 non-null   datetime64[ns]
 13  day             9045 non-null   int64         
 14  month_num       9045 non-null   int64         
 15  month           9045 non-null   object        
 16  war_day         9045 non-null   int64         
 17  war_week        9045 non-null   int64         
 18  weekday_num     9045 non-null   int64         
 19  weekday         9045 non-null   object        
 20  is_end          9045 non-null   int8          
 21  is_warning      9045 non-null   int8          
 22  min_id          4971 non-null   float64       
 23  max_id          4074 non-null   float64       
 24  time_delta_min  3103 non-null   float64       
dtypes: bool(2), datetime64[ns](1), float64(5), int16(1), int64(6), int8(2), object(8)
memory usage: 1.4+ MB
In [4]:
print('Number of starts of warnings: ',df[df['unique_warning'] == True].shape[0])
print('Numberof ends of warnings: ', df[df['last_end'] == True].shape[0])
Number of starts of warnings:  3105
Numberof ends of warnings:  3105
Selecting the needed region
In [5]:
# take from 1 march
df_region = df[(df['region'] == 'Київська') & (df['date']>= pd.to_datetime('3/1/2022'))]
# global var
region = 'Kyiv'
last_date = df_region['datetime'].max()

df_region
Out[5]:
id last_end unique_warning index _ datetime message ttl_period action region ... month war_day war_week weekday_num weekday is_end is_warning min_id max_id time_delta_min
2777 1364 True False 0 Message 2022-04-21 18:04:37+00:00 Київська обл.\nВідбій тривоги - будьте обережні NaN NaN Київська ... April 57 8 3 Thursday 1 0 NaN 1364.0 66.0
2778 1363 False False 1 Message 2022-04-21 18:02:29+00:00 Київ\nВідбій тривоги - будьте обережні NaN NaN Київська ... April 57 8 3 Thursday 1 0 NaN 1364.0 NaN
2779 1362 False False 2 Message 2022-04-21 17:00:44+00:00 Київська обл.\nТривога - Усі в укриття NaN NaN Київська ... April 57 8 3 Thursday 0 1 1361.0 NaN NaN
2780 1361 False True 3 Message 2022-04-21 16:58:02+00:00 Київ\nТривога - Усі в укриття NaN NaN Київська ... April 57 8 3 Thursday 0 1 1361.0 NaN NaN
2781 1360 True False 4 Message 2022-04-21 14:58:46+00:00 Київська обл.\nВідбій тривоги - будьте обережні NaN NaN Київська ... April 57 8 3 Thursday 1 0 NaN 1360.0 47.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4093 25 True False 1337 Message 2022-03-01 07:24:39+00:00 Київ, відбій тривоги🙏‼️ NaN NaN Київська ... March 6 0 1 Tuesday 1 0 NaN 25.0 139.0
4094 24 False False 1338 Message 2022-03-01 06:54:46+00:00 ‼️Повітряна тривога, Київ! NaN NaN Київська ... March 6 0 1 Tuesday 0 1 23.0 NaN NaN
4095 23 False True 1339 Message 2022-03-01 05:05:08+00:00 ‼️ Повітряна тривога, Київ! NaN NaN Київська ... March 6 0 1 Tuesday 0 1 23.0 NaN NaN
4096 22 True False 1340 Message 2022-03-01 00:43:09+00:00 ❕Відбій повітряної тривоги! NaN NaN Київська ... March 6 0 1 Tuesday 1 0 NaN 22.0 13.0
4097 21 False True 1341 Message 2022-03-01 00:29:17+00:00 Київ, повітряна тривога! NaN NaN Київська ... March 6 0 1 Tuesday 0 1 21.0 NaN NaN

1321 rows × 25 columns

In [6]:
last_date
Out[6]:
'2022-04-21 18:04:37+00:00'

Execution of calculations

intensity of warnings by days

In [7]:
# selecting 
warday = df_region[['month_num','month','war_day', 'unique_warning']]
# grouping
warday = warday.groupby(by=['month_num','month','war_day']).sum()
warday = warday.reset_index()
# plotting
warday_line =  px.line(warday, x='war_day', y="unique_warning", color="month",
                      title=f'intensity of warnings by days in {region}')
warday_line.show()
1020304050246810
monthMarchAprilintensity of warnings by days in Kyivwar_dayunique_warning

The absolute number of warnings per weekday

In [8]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num']]
# grouping 
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).sum()
weekday = weekday.reset_index()
# printing
weekday_bar =  px.bar(weekday, x="weekday", y="unique_warning", color="month", barmode="group", 
                     title=f'The absolute number of warnings per weekday in {region}')
weekday_bar.show()
MondayTuesdayWednesdayThursdayFridaySaturdaySunday05101520253035
monthMarchAprilThe absolute number of warnings per weekday in Kyivweekdayunique_warning

median number of warnings per weekday

We count the median

In [9]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
# grouping 2
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).median()
weekday = weekday.reset_index()
weekday
# printing
weekday_bar =  px.bar(weekday, x="weekday", y="unique_warning", color="month", barmode="group",
                     title=f'median number of warnings per weekday in {region}')
weekday_bar.show()
MondayTuesdayWednesdayThursdayFridaySaturdaySunday0246810
monthMarchAprilmedian number of warnings per weekday in Kyivweekdayunique_warning

distribution of warnings per weekday per months

In [10]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'unique_warning', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="unique_warning", color="weekday", 
                        title=f'distribution of warnings per weekday per months in {region}')
weekday_boxplot.show()
MarchApril246810
weekdayMondayTuesdayWednesdayThursdayFridaySaturdaySundaydistribution of warnings per weekday per months in Kyivmonthunique_warning

duration of warnings

absolute duration by days of war

In [11]:
# selecting 
warday = df_region[['month','war_day', 'time_delta_min']]
# grouping
warday = warday.groupby(by=['war_day', 'month']).sum()
warday = warday.reset_index()
# plotting
warday_line =  px.line(warday, x='war_day', y="time_delta_min", color="month")
warday_line.show()
1020304050020040060080010001200
monthMarchAprilwar_daytime_delta_min

median and mean duration of one warning by days of war

In [12]:
# selecting 
warday = df_region[['month','war_day', 'time_delta_min']].copy()
# grouping median
warday_median = warday.groupby(by=['war_day', 'month']).median()
warday_median['measurement'] = 'Median'
# grouping mean
warday_mean = warday.groupby(by=['war_day', 'month']).mean()
warday_mean['measurement'] = 'Mean'
# union
warday = pd.concat([warday_median, warday_mean])
warday = warday.reset_index()
# plotting
warday_line_mean =  px.line(warday, x='war_day', y="time_delta_min", color="measurement",
                           title=f'median and mean duration of one warning by days of war in {region}')
warday_line_mean.show()
1020304050050100150200250300350400
measurementMedianMeanmedian and mean duration of one warning by days of war in Kyivwar_daytime_delta_min

median time of warnings per weekday

In [13]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday']).median()
weekday = weekday.reset_index()
# printing
weekday_bar =  px.bar(weekday, x="weekday", y="time_delta_min", color="month", barmode="group",
                     title=f'median time of warnings per weekday in {region}')
weekday_bar.show()
MondayTuesdayWednesdayThursdayFridaySaturdaySunday010203040506070
monthMarchAprilmedian time of warnings per weekday in Kyivweekdaytime_delta_min

distribution of time in warnings per weekday per months

In [14]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # addedwarday
# grouping 1
weekday = weekday.groupby(by=['month_num', 'month','weekday_num', 'weekday', 'war_day']).sum()
weekday = weekday.reset_index()
weekday
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="time_delta_min", color="weekday", 
                        title=f'distribution of time in warnings per weekday per months in {region}')
weekday_boxplot.show()
MarchApril020040060080010001200
weekdayMondayTuesdayWednesdayThursdayFridaySaturdaySundaydistribution of time in warnings per weekday per months in Kyivmonthtime_delta_min

distribution of time of 1 warning per weekday per months

In [15]:
# selecting
weekday = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # added warday
weekday = weekday[pd.isna(weekday['time_delta_min']) == False].sort_values(by=['month_num', 'month', 'weekday_num', 'weekday'])
# grouping 1
weekday
# printing
weekday_boxplot = px.box(weekday, x='month' ,y="time_delta_min", color="weekday", 
                        title=f'distribution of time of 1 warning per weekday per months {region}')
weekday_boxplot.show()
MarchApril0200400600800
weekdayMondayTuesdayWednesdayThursdayFridaySaturdaySundaydistribution of time of 1 warning per weekday per months Kyivmonthtime_delta_min

Histogram of times of 1 warning

In [16]:
# selecting
month_dist = df_region[['weekday_num', 'weekday', 'time_delta_min', 'month', 'month_num', 'war_day']] # added warday
month_dist = month_dist[pd.isna(month_dist['time_delta_min']) == False].sort_values(by=['month_num', 'month'])
# printing
hist = px.histogram(month_dist, x="time_delta_min", color="month", nbins=100, marginal="box" , facet_row="month",
                   title=f'histogram of time of 1 warning per months in {region}')
hist.show()
01002003004005006007008009000510152005101520
monthMarchAprilhistogram of time of 1 warning per months in Kyivtime_delta_mincountcountmonth=Aprilmonth=March

Kaplan Meier's probability of warnings

In [17]:
# function of counting the Kaplan Meier
def kaplan_meier(df, duration_column='', event_column=''):
    df = df[df[duration_column] > 0]
    durations = df.sort_values(duration_column)[duration_column].unique()
    # Initialise the table
    columns = ['duration', 'n_at_risk', 'n_events', 
               'survival_probability']
    km = pd.DataFrame(columns=columns, dtype=np.number)
    km = km.append(pd.DataFrame([[0, df.shape[0], 0, 1]], 
                                    columns=columns))
    # Calculate survival probability for each duration
    for i, t in enumerate(durations):
        n = np.sum(df[duration_column]>=t)
        d = np.sum((df[duration_column]==t) & (df[event_column]==True))
        s = (1 - d / n) * km.loc[i, 'survival_probability']
        km = km.append(pd.DataFrame([[t, n, d, s]], 
                                    index=[i+1],
                                    columns=columns))
#         km = km[(pd.isna(km['survival_probability']) == False)]
    return km
In [18]:
# seleting necessary values
# per all time
df_km = df_region[['time_delta_min', 'last_end']]
df_km = df_km[(df_km['last_end'] == True) & (df_km['time_delta_min'] > 0)]
df_km['survival_probability'] = 1
df_km['event'] = 1
df_km.index = range(0, df_km.shape[0])

# per month
df_km_march = df_region[df_region['month']=='March']
df_km_april = df_region[df_region['month']=='April']

# counting the Kaplan Meier probability
km_march = kaplan_meier(df_km_march, duration_column='time_delta_min', event_column='last_end')
km_april = kaplan_meier(df_km_april, duration_column='time_delta_min', event_column='last_end')
# creating the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=km_march['duration'], 
                         y=km_march['survival_probability'],
                    line_shape='vh', name=f'March'))
fig.add_trace(go.Scatter(x=km_april['duration'], 
                         y=km_april['survival_probability'],
                    line_shape='hv', name=f'April'))
fig.add_trace(go.Scatter(x=np.array([0, km_march['duration'].max()]), y=np.array([0.5, 0.5]), name="median",
                    line_shape='linear', line_color='rgb(0,176,246)'))
# fig.update_layout(legend=dict(y=0.5, traceorder='reversed', font_size=16))
fig.update_layout(title=f'Kaplan Meiers probability of warnings in {region}', 
                  xaxis_title='duration of warnings (mins)',
                  yaxis_title='probaility',)
fig.show()
020040060080000.20.40.60.81
MarchAprilmedianKaplan Meiers probability of warnings in Kyivduration of warnings (mins)probaility

Scatter of durations and number of warnings

In [19]:
# selecting
df_regr = df_region[['month_num', 'month', 'war_day','unique_warning', 'time_delta_min']]

# grouping 1
wardays_warnings = df_regr[['month_num', 'month', 
                            'war_day','unique_warning']].groupby(by=['month_num', 'month', 'war_day']).sum()
# grouping 2
wardays_durations = df_regr[['month_num', 'month', 
                            'war_day','time_delta_min']].groupby(by=['month_num', 'month', 'war_day']).sum()
# joinings
df_durations_warnings = pd.merge(left=wardays_warnings, right=wardays_durations,
                                         left_index=True, right_index=True)
df_durations_warnings = df_durations_warnings.reset_index()
df_durations_warnings['duration of warnings per day (mins)'] = df_durations_warnings['time_delta_min']
df_durations_warnings['number of warnings per day'] = df_durations_warnings['unique_warning']
# printing the figure
fig = px.scatter(df_durations_warnings, x='number of warnings per day', y='duration of warnings per day (mins)',
                 color='month', title=f'scatter of durations and number of warnings in {region}')
fig.show()
246810020040060080010001200
monthMarchAprilscatter of durations and number of warnings in Kyivnumber of warnings per dayduration of warnings per day (mins)

Scope

Now we can see that situation in Kyiv with air warnings in April is better than was in March. It has relationship to news from North that Russian army lossed the battle of Kyiv and overdislocated to South and East of Ukraine for attacking the OUF zone. For our hopes Russians will suck our cock